********************************************************************************
* Ranks employers using PageRank.
*
********************************************************************************
*** prepare
* open the named log for this run, closing a leftover log of the same name first
capture log close pageranks
log using "${LOG_DIR}/log_pageranks_${ext}.log", text name(pageranks) replace

* suffix for the saved file name: category-specific when ranks are estimated by category, empty otherwise
local categ_ext ""
if $pagerank_by_categ local categ_ext "_${categ_var}"

* suffix for loading: empty for the default AKM specification, numeric extension otherwise
local ext_str_load ""
if !($akm_default) local ext_str_load "_${ext}"

* suffix for saving: empty for the default PageRank specification, numeric extension otherwise
local ext_str_save ""
if !($pagerank_default) local ext_str_save "_${ext}"

* choose which category(s) to loop through
* - pooled estimation (no split by category): loop over 0 only
* - a single category present (min equals max): loop over 0 and that category
* - otherwise: loop over the lowest and highest category
* Note: a variant that also includes the pooled run, i.e. "0 min max", was disabled for now -- may need pooled ranks later?
if ${pagerank_by_categ} == 0 {
	local xxx = 0
}
if inrange(real("`xxx'"), 0, 2) {
	global g_list "`xxx'"
}
else if $${categ_var}_min == $${categ_var}_max {
	global g_list "0 $${categ_var}_min"
}
else {
	global g_list "$${categ_var}_min $${categ_var}_max"
}


*** load data
foreach g of global g_list { // 0 = both categories, 1 = men/nohsch only, 2 = women/hsch only
	* One pass per category in g_list:
	*   1. load the worker-employer panel (monthly or annual) restricted to the connected set
	*   2. turn consecutive observations of a worker into parent -> child employer transitions
	*   3. export the (optionally weighted) edge list plus a parameter file
	*   4. launch MATLAB, wait for its stop file, and import the resulting PageRanks
	*   5. when estimating by category, save the result as an intermediate file
	disp "*** CATEGORY = `g'"
	
	* load relevant data
	if $pagerank_monthly { // if using monthly flows data
		
		* process the data in batches of workers (persid modulo number of batches) to limit the size of the reshape
		forval b = 1/$n_batches {
			disp "--> batch = `b'"

			disp "* load connected set data"
			do "${DO_DIR}/FUN_LOAD.do" $year_min $year_max "${vars_list} hire_month sep_month id_unique" // variable list "${vars_list}" defined in VALUE_0_MASTER.do
			// make the job identifier unique across years by regrouping on (year, id_unique)
			egen double xxx = group(year id_unique)
			cap recast double id_unique
			replace id_unique = xxx
			drop xxx
		
			disp "* keep only relevant data batch"
			if $n_batches > 1 keep if mod(persid, ${n_batches}) == `b' - 1
			
			disp "* rename variables"
			rename ${income_var} inc
			
			disp "* keep only relevant variables"
			if `g' == 0 keep year ${empid_var} persid hours_year inc hire_month sep_month id_unique // i.e., drop all other variables contained in variable list ${vars_list}
			else if inlist(`g', 1, 2) {
				keep year ${empid_var} persid hours_year inc hire_month sep_month id_unique ${categ_var} // i.e., drop all other variables contained in variable list ${vars_list}
				keep if ${categ_var} == `g'
				drop ${categ_var}
			}
			
			disp "* construct earnings in each month the job is active"
			* a separation month of 0 is recoded to 13 so that the job counts as active through December
			recode sep_month (0=13)
			forval m = 1/12 {
				gen float inc`m' = inc if inrange(`m', hire_month, sep_month)
				compress inc`m'
			}
			drop hire_month sep_month
			
			disp "* reshape data to long format"
			* n numbers multiple records of the same worker-employer-year so that the reshape key is unique
			bys persid ${empid_var} year (inc): gen double n = _n
			compress n
			drop inc
			reshape long inc, i(persid ${empid_var} year n) j(month) // Note: could use ${gtools}, but risks running into error, since "A Stata bug prevents gtools from working with more than 2,147,483,647 observations."
			drop n
			compress
			
			disp "* create variable containing year-month combination"
			gen int year_month = ym(year, month)
			drop year month
			format year_month %tm
			compress year_month
			
			disp "* keep only required variables"
			* NOTE(review): the local macro var_categ is not defined anywhere in this file and expands to nothing here unless a caller sets it -- confirm it can be removed
			* NOTE(review): inc is dropped here without first dropping the rows where it is missing, so months outside the hire/separation window appear to remain in the data -- confirm this is intended or handled in FUN_MERGE_CONNECTED.do
			keep year_month ${empid_var} persid `var_categ' id_unique // i.e., drop hours_year inc rand

			disp "* restrict to unique job IDs in connected set for AKM (subject to selection criteria imposed in VALUE_1_CONNECTED.do)"
			* input argument of FUN_MERGE_CONNECTED.do:
			*   1 = merge with connected set of firm IDs used for AKM estimation
			*   2 = merge with connected set of firm IDs used for PageRank estimation
			*   3 = merge with both connected sets of firm IDs
			*   4 = merge with both connected sets of firm IDs in one step (faster)
			*   5 = merge with connected set of unique observation IDs (worker-firm-year-job) used for AKM estimation
			*   6 = merge with connected set of unique observation IDs (worker-firm-year-job) used for PageRank estimation
			* NOTE(review): the message above mentions AKM, while argument 6 selects the PageRank connected set
			do "${DO_DIR}/FUN_MERGE_CONNECTED.do" 6
			drop id_unique
		
			disp "* save batch"
			* with a single batch the data simply stay in memory
			if $n_batches > 1 saveold "${TEMP_DIR}/temp_pagerank_batch_`b'_g`g'.dta", version(${saveold_v}) replace
		}
		disp "* append data batches"
		if $n_batches > 1 {
			clear
			forval b = 1/$n_batches {
				* NOTE(review): the local macro var_list is not defined in this file, so keep() is passed an empty list unless a caller sets it -- confirm
				append using "${TEMP_DIR}/temp_pagerank_batch_`b'_g`g'.dta", keep(`var_list')
				* NOTE(review): the original comment on the next line says the batch file is used again further down, but it is erased here and not referenced again in this file -- confirm
				rm "${TEMP_DIR}/temp_pagerank_batch_`b'_g`g'.dta" // Note: this is used again further down!
			}
		}
	}
	else { // if using annual flows data
		disp "* load connected set data"
		do "${DO_DIR}/FUN_LOAD.do" $year_min $year_max "${vars_list} hire_month sep_month id_unique" // variable list "${vars_list}" defined in VALUE_0_MASTER.do
		drop hire_month sep_month
		// make the job identifier unique across years by regrouping on (year, id_unique)
		egen double xxx = group(year id_unique)
		cap recast double id_unique
		replace id_unique = xxx
		drop xxx
		
		disp "* rename variables"
		rename ${income_var} inc
		
		disp "* keep only relevant variables"
		if `g' == 0 keep year ${empid_var} persid hours_year inc id_unique // i.e., drop all other variables contained in variable list ${vars_list}
		else if inlist(`g', 1, 2) {
			keep year ${empid_var} persid hours_year inc id_unique ${categ_var} // i.e., drop all other variables contained in variable list ${vars_list}
			keep if ${categ_var} == `g'
			drop ${categ_var}
		}
		
		disp "* rename year variable"
		* the time variable carries the same name in both branches so that the code below is shared
		rename year year_month
		
		disp "* keep only required variables"
		keep year_month ${empid_var} persid id_unique // i.e., drop hours_year inc
		
		disp "* restrict to unique job IDs in connected set for AKM (subject to selection criteria imposed in VALUE_1_CONNECTED.do)"
		* argument 6 = merge with connected set of unique observation IDs (worker-firm-year-job) used for PageRank estimation
		* NOTE(review): the message above mentions AKM, while argument 6 selects the PageRank connected set
		do "${DO_DIR}/FUN_MERGE_CONNECTED.do" 6
		drop id_unique
	}
	
	disp "* set panel"
	xtset persid year_month

	disp "* create child employer IDs (where workers flow to)"
	rename ${empid_var} child
	label var child "Child employer ID"

	disp "* create parent employer IDs (where workers flow from)"
	* the parent is the employer of the same worker one period earlier
	gen double parent = L.child
	label var parent "Parent employer ID"
	drop persid year_month

	disp "* keep only nonmissing parent employer and child employer observations (treat U as exogenous in- and outflows)"
	keep if !inlist(., parent, child)
	// implies that establishments without any period-to-period E-to-E in/outflow are dropped

	disp "* PageRank option: count self-nodes?"
	if !${pagerank_self} keep if parent != child

	disp "* PageRank option: weight nodes?"
	if $pagerank_weight {
		gen byte one = 1
		${gtools}collapse (count) weight=one, by(parent child) fast // if intending to use importance weights for nodes
		label var weight "Importance weight (number of transitions)"
	}
	else bys parent child: keep if _n == 1 // else, if not intending to use importance weights for nodes

	disp "* save final output to be read by MATLAB"
	* tab-delimited edge list without header: parent, child and, if weighted, the number of transitions
	sort parent child
	format parent child %14.0f
	compress
	if $pagerank_weight {
		order parent child weight
		export delim parent child weight using "${TEMP_DIR}/pagerank_input_${ext}.csv", novarnames nolabel datafmt delim(tab) replace
	}
	else {
		order parent child
		export delim parent child using "${TEMP_DIR}/pagerank_input_${ext}.csv", novarnames nolabel datafmt delim(tab) replace
	}

	disp "* export parameters to be read from MATLAB"
	clear
	set obs 1
	gen byte pagerank_weight = ${pagerank_weight} // indicator for whether or not nodes should be weighted. 0 = unweighted; 1 = weighted.
	gen float pagerank_damping = ${pagerank_damping} // PageRank damping factor (between 0.00 and 1.00), which is the probability that a random surfer clicks on a link on the current page, instead of continuing on another random page. Standard value in computer science = 0.85, but Sorkin (2018, QJE) sets = 1.00.
	gen double ext = ${ext}
	format pagerank_weight %1.0f
	format pagerank_damping %5.4f
	format ext %12.0f
	* the parameter file name is shared across runs: wait until no such file exists before writing a new one
	* (presumably the MATLAB script deletes it after reading -- confirm against VALUE_3_PAGERANKS.m)
	local parameters_exist = 1
	cap confirm file "${TEMP_DIR}/parameters_pagerank.csv"
	if !_rc disp as error "USER WARNING: Parameters file (${TEMP_DIR}/parameters_pagerank.csv) already exists -- entering sleep loop."
	while `parameters_exist' {
		cap confirm file "${TEMP_DIR}/parameters_pagerank.csv"
		local parameters_exist = !_rc
		if `parameters_exist' sleep 60000 // sleep for 60s
	}
	compress
	export delim pagerank_weight pagerank_damping ext using "${TEMP_DIR}/parameters_pagerank.csv", novarnames nolabel datafmt delim(tab)
	clear
	
	disp "* call MATLAB"
	* erase a stop file left behind by an earlier, interrupted run, so that the polling loop below can only be released by the MATLAB job launched here
	cap rm "${TEMP_DIR}/stoppr_${ext}.txt"
	!"${MATLABPATH}" -nojvm -r "run ${DO_DIR}/VALUE_3_PAGERANKS.m"
	* poll every 10s until the stop file appears, then remove it
	cap confirm file "${TEMP_DIR}/stoppr_${ext}.txt"
	while _rc {
		sleep 10000
		cap confirm file "${TEMP_DIR}/stoppr_${ext}.txt"
	}
	rm "${TEMP_DIR}/stoppr_${ext}.txt"

	disp "* import MATLAB output"
	import delim using "${TEMP_DIR}/pagerank_output_${ext}.txt", varnames(1) asdouble clear
	
	disp "* delete old data files used in AKM estimation"
	* the files erased here are the PageRank exchange files written above and by MATLAB
	rm "${TEMP_DIR}/pagerank_input_${ext}.csv"
	rm "${TEMP_DIR}/pagerank_output_${ext}.txt"
	
	disp "* rename MATLAB output to reflect contents"
	rename empid ${empid_var}
	label var ${empid_var} "Employer ID"
	label var pagerank "PageRank"
	
	disp "* save intermediate output"
	* only category-specific runs (nonzero category) are saved; a pooled run stays in memory
	if $pagerank_by_categ & `g' saveold "${TEMP_DIR}/pageranks_g`g'_${year_min}_${year_max}_${ext}.dta", version(${saveold_v}) replace
}


*** create dataset comprising both categories
if $pagerank_by_categ {
	* start from an empty dataset that already carries the labelled category variable
	clear
	gen byte ${categ_var} = .
	label var ${categ_var} "Category"
	label define gen_l 1 "Male/NoHSch" 2 "Female/HSch", replace
	label val ${categ_var} gen_l
	
	* append intermediate output, one file per category
	foreach g of global g_list {
		* the category list can contain 0 (pooled run), for which no intermediate file is saved above, so there is nothing to append
		if `g' == 0 {
			continue
		}
		append using "${TEMP_DIR}/pageranks_g`g'_${year_min}_${year_max}_${ext}.dta"
		rm "${TEMP_DIR}/pageranks_g`g'_${year_min}_${year_max}_${ext}.dta"
		* newly appended rows still have a missing category: tag them with the current one
		replace ${categ_var} = `g' if ${categ_var} == .
	}
}

* summarize ties in PageRanks
* N = number of employers sharing the same PageRank value (within category, if estimated by category)
if $pagerank_by_categ {
	bys ${categ_var} pagerank: gen long N = _N
	bys ${categ_var}: tab N, m
}
else {
	bys pagerank: gen long N = _N
	tab N, m
}
drop N


* split ties in PageRanks by injecting a small amount of noise
if !$pagerank_ties {
	* the category variable exists only in the by-category dataset, so group by it only in that case
	if $pagerank_by_categ local tie_by "${categ_var}"
	else local tie_by ""
	
	* fix the observation order before drawing, so that a given seed assigns the same draw to the same employer (the preceding sort on pagerank leaves tied rows in arbitrary order)
	sort `tie_by' ${empid_var}
	set seed 123
	* uniform noise in (-0.5, 0.5) scaled by 1e-13
	replace pagerank = pagerank + (runiform() - .5)/10^13
	
	* re-tabulate the number of employers per PageRank value to check that ties are gone
	bys `tie_by' pagerank: gen long N = _N
	if $pagerank_by_categ bys ${categ_var}: tab N, m
	else tab N, m
	drop N
}

* save
* sort and order by the identifying variables (category first when ranks are estimated by category)
local id_vars "${empid_var}"
if $pagerank_by_categ local id_vars "${categ_var} ${empid_var}"
sort `id_vars'
order `id_vars' pagerank
compress
save "${TEMP_DIR}/pageranks`categ_ext'_${year_min}_${year_max}`ext_str_save'.dta", replace


*** final housekeeping
* close the named log opened at the top of this file
log close pageranks
